--- title: Core keywords: fastai sidebar: home_sidebar summary: "Core functionality for the fastai audio library." ---

Audio Signals

AudioGetter

This section regroups the basic types used in audio with the transforms that create objects of those types.

get_audio_files[source]

get_audio_files(path, recurse=True, folders=None)

Get audio files in path recursively, only in folders, if specified.

AudioGetter[source]

AudioGetter(suf='', recurse=True, folders=None)

Create get_audio_files partial function that searches path suffix suf and passes along kwargs, only in folders, if specified.

p = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
untar_data(URLs.SPEAKERS10, fname=str(p)+'.tar', dest=p)
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/ST-AEDS-20180100_1-OS')
p.ls()
(#3844) [/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0001_us_f0001_00168.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00286.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00282.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00432.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00054.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0004_us_m0004_00110.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0003_us_m0003_00180.wav...]
audio_get_func = AudioGetter("", recurse=True, folders=None)
files = audio_get_func(p)
#files will load differently on different machines so we specify examples by name
ex_files = [p/f for f in ['m0005_us_m0005_00218.wav', 
                                'f0003_us_f0003_00279.wav', 
                                'f0001_us_f0001_00168.wav', 
                                'f0005_us_f0005_00286.wav',]]

AudioItem

class AudioItem[source]

AudioItem(iterable=()) :: tuple

Built-in immutable sequence.

If no argument is given, the constructor returns an empty tuple. If iterable is specified the tuple is initialized from iterable's items.

If the argument is a tuple, the return value is the same object.

show_audio_signal[source]

show_audio_signal(ai, ctx, **kwargs)

type(AudioItem((None, None, ex_files[0])))
__main__.AudioItem
item0 = AudioItem.create(ex_files[0])
item0.sig.shape
torch.Size([1, 58240])
item0.sr, item0.nchannels, item0.nsamples, item0.duration
(16000, 1, 58240, 3.64)
test_eq(type(item0.sig), torch.Tensor)
test_eq(item0.sr, 16000)
test_eq(item0.nchannels, 1)
test_eq(item0.nsamples, 58240)
test_eq(item0.duration, 3.64)
item0[0]
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -9.1553e-05,
         -6.1035e-05,  0.0000e+00]])
item0.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
item1 = AudioItem.create(files[1]);
item0.show()
item1.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav
#get 3 equal length portions of 3 different signals so we can stack them
#for a fake multichannel example
ai0, ai1, ai2 = map(AudioItem.create, ex_files[1:4]);
min_samples = min(ai0.nsamples, ai1.nsamples, ai2.nsamples)
s0, s1, s2 = map(lambda x: x[:,:min_samples], (ai0.sig, ai1.sig, ai2.sig))
test_eq(s0.shape, s1.shape)
test_eq(s1.shape, s2.shape)
fake_multichannel = AudioItem((torch.stack((s0, s1, s2), dim=1).squeeze(0), 16000, None))
test_eq(fake_multichannel.nchannels, 3)
test_eq(fake_multichannel.nsamples, 53760)
fake_multichannel.show()
File: None

class OpenAudio[source]

OpenAudio(items) :: Transform

Delegates (__call__,decode,setup) to (encodes,decodes,setups) if split_idx matches

repr of Transform is:
classname: self.use_as_item {self.encodes} {self.decodes}
encodes and decodes are TypeDispatches whose reprs are str of dict where k/v pair is typename and function that handles that type

oa = OpenAudio(files); oa
OpenAudio: True (object,object) -> encodes (object,object) -> decodes
#demonstrate functionality of OpenAudio.encodes, the rest of the nb will
#use files that are opened by name for reproducibility/testing
oa = OpenAudio(files)
item100 = oa.encodes(100)
item100.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0003_us_m0003_00340.wav
#test open audio on a random set of files
for i in range(10):
    idx = random.randint(0, len(files))
    test_eq_type(oa.encodes(idx), AudioItem.create(files[idx]))
    test_eq_type(oa.decodes(idx), files[idx])
type(oa)
__main__.OpenAudio
oa.encodes(0)
(tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.5259e-04,
          -6.1035e-05, -1.8311e-04]]),
 16000,
 PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav'))
oa.decodes(0)
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav')
oa.items[0]
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav')

Create functions to wrap TorchAudio

Note:
If a function (e.g. specshow) accepts kwargs, this won't pass extra arguments because specshow doesn't accept all kwargs, and will break if you pass in unexpected ones. Since we have no way of knowing what functions they delegate to and pulling out the relevant kwargs, if there is something we know it accepts as a kwarg, like "cmap", we need to pass it in manually.
Note:
Add func only works if all args are keyword arguments; it doesn't work for unnamed args. Could add in a get usable args that checks if the default is inspect._empty. This also needs more tests.

get_usable_kwargs takes a function and a dictionary of kwargs that may or may not be relevant to that function and returns a dictionary of all the default values to that function, updated with the kwargs that can be successfully applied. This is done because, first, it allows us to combine multiple functions into a single AudioToSpec Transform but only pass the appropriate kwargs; secondly, it allows us to keep a dictionary of the settings used to create the Spectrogram, which is sometimes used in its display and cropping; and third, it allows us to warn the user when they are passing in improper or unused kwargs.

get_usable_kwargs[source]

get_usable_kwargs(func, kwargs, exclude)

kwargs = {'a':1, 'b':2}
extra_kwargs = {'z':0, 'a':1, 'b':2, 'c':3}
test_eq(get_usable_kwargs(operator.add,       kwargs, []), kwargs)
test_eq(get_usable_kwargs(operator.add, extra_kwargs, []), kwargs)

Audio Spectrograms

Note:
Overriding getattr to store the settings isn't ideal, but if we dump them all in as attributes by doing `x.__dict__.update(settings)` we then can't easily pass settings when we do a transform and create a new AudioSpectrogram object. Potential fixes are
1. Having both a settings dict and updating the dict with all its attributes (this feels dirty)
2. Finding a way to implement deepcopy for AudioSpectrogram so that we can clone it efficiently
3. Dumping the spectrogram settings and having a method that collects them so it can be passed to the constructor when we make a new AudioSpectrogram object in a transform

AudioSpectrogram Class

class AudioSpectrogram[source]

AudioSpectrogram(x, **kwargs) :: TensorImageBase

TO-DO:
1. Get colorbar and axes working for multiplot display
2. Have someone who knows matplotlib better cleanup/refactor
3. Plotting the spectrogram forces it to a uniform size, we may want to display either the shape of the image, or display it to scale with something like plt.figure(figsize=(sg.width/30, sg.height/30))

show_spectrogram[source]

show_spectrogram(sg, ax, ctx, figsize, **kwargs)

Note:
_validate and _warn_kwargs should probably be abstracted up a level; they don't belong to the AudioToSpec class and could be useful to check args in general.

Spectrogram Generation: AudioToSpec

class AudioToSpec[source]

AudioToSpec(mel=True, to_db=True, stype='power', top_db=None, **kwargs) :: Transform

Delegates (__call__,decode,setup) to (encodes,decodes,setups) if split_idx matches

Display and Testing

# get a sg with weird settings for testing
a2s = AudioToSpec(f_max=20000, n_mels=137)
sg = a2s(item0)
sg2 = a2s(item100)
sg_mc = a2s(fake_multichannel)
sg.show()
sg.show()
sg2.show()
sg_mc.show()
sg._settings
{'sample_rate': 16000,
 'n_fft': 1024,
 'win_length': 1024,
 'hop_length': 512,
 'f_min': 0.0,
 'f_max': 20000,
 'pad': 0,
 'n_mels': 137,
 'window_fn': <function _VariableFunctions.hann_window>,
 'wkwargs': None,
 'stype': 'power',
 'top_db': None,
 'transformer': Sequential(
   (0): MelSpectrogram(
     original_name=MelSpectrogram
     (spectrogram): Spectrogram(original_name=Spectrogram)
     (mel_scale): MelScale(original_name=MelScale)
   )
   (1): AmplitudeToDB(original_name=AmplitudeToDB)
 ),
 'to_db': True,
 'mel': True,
 'sr': 16000,
 'nchannels': 1,
 'path': PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav')}
sg.nchannels, sg.height, sg.width
(1, 137, 114)
#test the explicit settings were properly stored in the spectrogram object and can be accessed as attributes
test_eq(sg.f_max, 20000)
test_eq(sg.hop_length, 512)
test_eq(sg.sr, item100.sr)
test_eq(sg.mel, True)
test_eq(sg.to_db, True)
test_eq(sg.nchannels, 1)
test_eq(sg.height, 137)
test_eq(sg.n_mels, sg.height)
test_eq(sg.width, 114)
defaults = {k:v.default for k, v in inspect.signature(_GenMelSpec).parameters.items()}
a2s = AudioToSpec(f_max=20000, hop_length = 345)
sg = a2s(item100)
test_eq(sg.n_mels, defaults["n_mels"])
test_eq(sg.n_fft , 1024)
test_eq(sg.shape[1], sg.n_mels)
test_eq(sg.hop_length, 345)
# test the spectrogram and audio have same duration, both are computed
# on the fly as transforms can change their duration
test_close(sg.duration, item100.duration, eps=0.1)

Test warnings for missing/extra arguments

SHOW_W=False
#test warning for unused argument 'power' for melspec
#tests AudioToSpec and its from_cfg class method
voice_mel_cfg = {'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256, 'power':2}
test_warns(lambda: AudioToSpec(**voice_mel_cfg), show=SHOW_W)
test_warns(lambda: AudioToSpec.from_cfg(voice_mel_cfg), show=SHOW_W)
#test for unused arguments 'f_max' and 'n_mels' for non-mel Spectrogram
voice_mel_cfg = {'f_max':22050., 'n_mels':128, 'n_fft':2560, 'hop_length':256, 'power':2}
test_warns(lambda: AudioToSpec(mel=False, **voice_mel_cfg), show=SHOW_W)
#test warning for unused argument 'top_db' when db conversion not done
voice_mel_cfg = {'top_db':20, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256}
test_warns(lambda: AudioToSpec(to_db=False, **voice_mel_cfg), show=SHOW_W)
#test warning for invalid argument 'doesntexist'
voice_mel_cfg = {'doesntexist':True, 'n_fft':2560, 'f_max':22050., 'n_mels':128, 'hop_length':256}
test_warns(lambda: AudioToSpec(to_db=False, **voice_mel_cfg), show=SHOW_W)

AudioToSpec Timing Tests

a_to_db_mel = AudioToSpec()
a_to_nondb_mel = AudioToSpec(to_db=False)
a_to_db_nonmel = AudioToSpec(mel=False)
a_to_nondb_non_mel = AudioToSpec(mel=False, to_db=False)
a_to_db_mel_hyperparams = AudioToSpec(n_fft=8192, hop_length=128)
%%timeit -n10
a_to_db_mel(item0)
The slowest run took 4.22 times longer than the fastest. This could mean that an intermediate result is being cached.
1.68 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
a_to_nondb_mel(item0)
1.41 ms ± 966 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
a_to_db_nonmel(item0)
1.58 ms ± 250 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
a_to_nondb_non_mel(item0)
1.33 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
# Time can blow up as a factor of n_fft and hop_length. n_fft is best kept to a power of two, hop_length
# doesn't matter except smaller = more time because we have more chunks to perform STFTs on
a_to_db_mel_hyperparams(item0)
21.2 ms ± 1.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

AudioToSpec Timing Tests as audio length scales

import time
def time_variable_length_audios(f, max_seconds=30, sr=16000, channels=1):
    """Time `f` on synthetic audio clips of increasing duration.

    Builds random `AudioItem`s of 1, 3, 5, ... up to `max_seconds` seconds
    at sample rate `sr` with `channels` channels, applies `f` to each, and
    returns a list of per-call elapsed times in milliseconds, rounded to
    2 decimal places.
    """
    times = []
    audios = [AudioItem((torch.randn(channels, sr*i), sr, None)) for i in range(1, max_seconds+1, 2)]
    for a in audios:
        # perf_counter is monotonic and higher-resolution than time.time,
        # so it's the appropriate clock for measuring elapsed durations
        start = time.perf_counter()
        f(a)
        end = time.perf_counter()
        times.append(round(1000*(end-start), 2))
    return times
%%time
a2s = AudioToSpec()
max_seconds = 180
times_mono = time_variable_length_audios(f=a2s, max_seconds=max_seconds)
times_stereo = time_variable_length_audios(f=a2s, max_seconds=max_seconds, channels=2)
plt.plot(np.arange(0,max_seconds,2), times_mono, label="mono")
plt.plot(np.arange(0,max_seconds,2), times_stereo, label="stereo")
plt.legend(['mono','stereo'])
plt.title("Time Taken by AudioToSpec")
plt.xlabel("Audio Duration in Seconds")
plt.ylabel("Processing Time in ms")
CPU times: user 23.4 s, sys: 1.84 s, total: 25.2 s
Wall time: 8.15 s
Text(0, 0.5, 'Processing Time in ms')

MFCC Generation

Issue:
MFCC is based on a melspectrogram so it accepts a bunch of the same arguments, but instead of passing them in explicitly, they are passed as a dict to "melkwargs". As a result, in the current state the mfcc has no current info about the hop_length (determines the width) that it was generated with. One option is grabbing the defaults from _GenMelSpec inside AudioToMFCC and passing them into the sg_settings. OTOH this could be an argument for lumping everything into AudioToSpec, including MFCC, and then we'd have the same access to _GenMelSpec arguments for tab-completion. We could also make AudioToMFCC have a 2nd delegation to _GenMelSpec, and then parse the MelSpec arguments ourselves and bundle them into melkwargs before passing them to torchaudio. This would break our concept of wrapping the external functions in internal references like _GenMelSpec, because we'd no longer be agnostic to how they're implemented. One last note is that melkwargs will not accept extra keywords, only the ones that torchaudio.transforms.MelSpectrogram expects.

class AudioToMFCC[source]

AudioToMFCC(sample_rate=16000, n_mfcc=40, dct_type=2, norm='ortho', log_mels=False, melkwargs=None) :: Transform

Delegates (__call__,decode,setup) to (encodes,decodes,setups) if split_idx matches

a2mfcc = AudioToMFCC()
mfcc = a2mfcc(item0)
test_eq(mfcc.n_mfcc, mfcc.data.shape[1])
mfcc.show()
mfcc._settings
{'sr': 16000,
 'nchannels': 1,
 'path': PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav'),
 'sample_rate': 16000,
 'n_mfcc': 40,
 'dct_type': 2,
 'norm': 'ortho',
 'log_mels': False,
 'melkwargs': None}
mfcc.height
40
mfcc.width
292
#n_mfcc specified should determine the height of the mfcc
n_mfcc = 67
a2mfcc67 = AudioToMFCC(n_mfcc=n_mfcc)
mfcc67 = a2mfcc67(item100)
test_eq(mfcc67.shape[1], n_mfcc)
print(mfcc67.shape)
mfcc67.show()
torch.Size([1, 67, 567])

Example of passing in melkwargs

a2mfcc_kwargs = AudioToMFCC(melkwargs={"hop_length":1024, "n_fft":1024})
mfcc_kwargs = a2mfcc_kwargs(item100)
mfcc_kwargs.show()
# make sure a new hop_length changes the resulting width
test_ne(mfcc_kwargs.width, mfcc.width)

MFCC Timing Tests

%%time
a2mfcc = AudioToMFCC()
max_seconds = 180
times_mono = time_variable_length_audios(f=a2mfcc, max_seconds=max_seconds)
times_stereo = time_variable_length_audios(f=a2mfcc, max_seconds=max_seconds, channels=2)
plt.plot(np.arange(0,max_seconds,2), times_mono, label="mono")
plt.plot(np.arange(0,max_seconds,2), times_stereo, label="stereo")
plt.legend(['mono','stereo'])
plt.title("Time Taken by AudioToMFCC")
plt.xlabel("Audio Duration in Seconds")
plt.ylabel("Processing Time in ms")
CPU times: user 27.1 s, sys: 2.45 s, total: 29.5 s
Wall time: 9.16 s
Text(0, 0.5, 'Processing Time in ms')

Example Pipelines

DB MelSpectrogram Pipe (Standard)

mel_cfg = {'n_fft':2560,'hop_length':64}
oa = OpenAudio(files)
a2s = AudioToSpec(**mel_cfg)
db_mel_pipe = Pipeline([oa,a2s], as_item=True)
for i in range(5):
    print("Shape:", db_mel_pipe(i).shape)
    db_mel_pipe.show(db_mel_pipe(i))
Shape: torch.Size([1, 128, 821])
Shape: torch.Size([1, 128, 1101])
Shape: torch.Size([1, 128, 1331])
Shape: torch.Size([1, 128, 841])
Shape: torch.Size([1, 128, 951])

Raw Spectrogram (non-mel, non-db) Pipe

cfg = {'hop_length':128, 'n_fft':400}
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec(mel=False, to_db=False, **cfg)], as_item=True)
for i in range(3):
    print("Shape:", db_mel_pipe(i).shape)
    db_mel_pipe.show(db_mel_pipe(i))
    test_eq(db_mel_pipe(i).hop_length, cfg["hop_length"])
Shape: torch.Size([1, 201, 411])
Shape: torch.Size([1, 201, 551])
Shape: torch.Size([1, 201, 666])

DBScale non-melspectrogram Pipe

oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec(mel=False)], as_item=True)
for i in range(3): 
    print("Shape:", db_mel_pipe(i).shape)
    db_mel_pipe.show(db_mel_pipe(i))
Shape: torch.Size([1, 513, 103])
Shape: torch.Size([1, 513, 138])
Shape: torch.Size([1, 513, 167])

Pipe using from_cfg (config)

#non-mel db-scale spectrogram 
cfg = {'mel':False, 'n_fft':260, 'f_max':22050., 'hop_length':128}
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(cfg)], as_item=True)
for i in range(3): 
    db_mel_pipe.show(db_mel_pipe(i))
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:61: UserWarning: f_max passed in but unused, your settings use <class 'torchaudio.transforms.Spectrogram'> not <class 'torchaudio.transforms.MelSpectrogram'>

MFCC Pipe

db_mfcc_pipe = Pipeline([oa, AudioToMFCC(n_mfcc=40),], as_item=True)
for i in range(3): 
    db_mfcc_pipe.show(db_mfcc_pipe(i))

AudioConfig Class

config_from_func[source]

config_from_func(func, name, **kwargs)

class AudioConfig[source]

AudioConfig()

# Basic Mel Spectrogram is just the Torchaudio defaults, which are currently bad, hence
# the empty melbins in the spectrogram below. We can make our own custom good ones like Voice
mel_cfg = AudioConfig.BasicMelSpectrogram()
a2mel = AudioToSpec.from_cfg(mel_cfg)
mel_bad = a2mel(oa(42))
mel_bad.show()
voice_cfg = AudioConfig.Voice()
a2mel = AudioToSpec.from_cfg(voice_cfg)
mel_good = a2mel(oa(42))
mel_good.show()
test_eq(mel_bad.n_fft, mel_cfg.n_fft)
# hop defaults to None in torchaudio but is set later in the code, we override this default to None
# internally in AudioToSpec to ensure the correct hop_length is stored as a sg attribute
test_ne(mel_bad.hop_length, mel_cfg.hop_length)
print("MelConfig Default Hop:", mel_cfg.hop_length)
print("Resulting Hop:",mel_bad.hop_length)
MelConfig Default Hop: None
Resulting Hop: 200
sg_cfg = AudioConfig.BasicSpectrogram()
# make sure mel setting is passed down and is false for normal spectro
test_eq(sg_cfg.mel, False)
#Grab a random file, test that the n_fft are passed successfully via config and stored in sg settings
oa = OpenAudio(files)
f_num = random.randint(0, len(files))
sg_cfg = AudioConfig.BasicSpectrogram(n_fft=2000, hop_length=155)
a2sg = AudioToSpec.from_cfg(sg_cfg)
sg = a2sg(oa(f_num))
test_eq(sg.n_fft, sg_cfg.n_fft)
test_eq(sg.width, int(oa(f_num).nsamples/sg_cfg.hop_length)+1)

Pipeline examples from Config

oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(sg_cfg)], as_item=True)
for i in range(3): 
    db_mel_pipe.show(db_mel_pipe(i))
voice_config = AudioConfig.Voice(); voice_config
Voice(sample_rate=16000, n_fft=1024, win_length=None, hop_length=128, f_min=50.0, f_max=8000.0, pad=0, n_mels=128, window_fn=<built-in method hann_window of type object at 0x7faeaff2d220>, wkwargs=None)
oa = OpenAudio(files)
db_mel_pipe = Pipeline([oa, AudioToSpec.from_cfg(voice_config)], as_item=True)
for i in range(3): 
    db_mel_pipe.show(db_mel_pipe(i))
mfcc_cfg = AudioConfig.BasicMFCC()
oa = OpenAudio(files)
mfcc_pipe = Pipeline([oa, AudioToMFCC.from_cfg(mfcc_cfg)], as_item=True)
for i in range(44,47):
    print("Shape", mfcc_pipe(i).shape)
    mfcc_pipe(i).show()
Shape torch.Size([1, 40, 183])
Shape torch.Size([1, 40, 490])
Shape torch.Size([1, 40, 260])

Export